Lab 13

Author

Hewan Weldai

From Previous Labs

In effort to analyze data from a table of Metagenome Assembled genomes (MAGS) from sites a part of the National Ecological Observatory Network, I used UMass Copilot AI to help create an R script.

In my previous efforts, I was able to create a first create the combined csv file with all of the correct data, load this file into R, and download the correct packages for these different analyses.

# Load libraries
library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)
library(tidyverse)
# Read in your dataset
df <- read_csv("NEON_soilMAGs_soilChem.csv")

# Inspect column names
glimpse(df)
Rows: 5,573
Columns: 43
$ ...1                  <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
$ genomicsSampleID      <chr> "BONA_001-O-20230710", "BONA_001-O-20230710", "B…
$ soilMoisture          <dbl> 1.546, 1.546, 1.546, 1.546, 1.546, 1.546, 1.546,…
$ decimalLatitude       <dbl> 65.17445, 65.17445, 65.17445, 65.17445, 65.17445…
$ decimalLongitude      <dbl> -147.4782, -147.4782, -147.4782, -147.4782, -147…
$ elevation             <dbl> 374.1, 374.1, 374.1, 374.1, 374.1, 374.1, 374.1,…
$ soilTemp              <dbl> 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1…
$ soilInWaterpH         <dbl> 3.993976, 3.993976, 3.993976, 3.993976, 3.993976…
$ soilInCaClpH          <dbl> 3.449951, 3.449951, 3.449951, 3.449951, 3.449951…
$ bin_oid               <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ bin_id                <chr> "3300079481_s0", "3300079481_s1", "3300079481_s1…
$ site                  <chr> "Caribou-Poker Creeks Research Watershed NEON Fi…
$ site_ID               <chr> "BONA", "BONA", "BONA", "BONA", "BONA", "BONA", …
$ subplot               <chr> "001", "001", "001", "001", "001", "001", "001",…
$ layer                 <chr> "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"…
$ quadrant              <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ date                  <dbl> 20230710, 20230710, 20230710, 20230710, 20230710…
$ img_genome_id         <dbl> 3300079481, 3300079481, 3300079481, 3300079481, …
$ bin_quality           <chr> "MQ", "HQ", "MQ", "MQ", "MQ", "MQ", "MQ", "MQ", …
$ bin_lineage           <chr> "Bacteria; Acidobacteriota; Terriglobia", "Bacte…
$ gtdb_taxonomy_lineage <chr> "Bacteria; Acidobacteriota; Terriglobia; Terrigl…
$ domain                <chr> "Bacteria", "Bacteria", "Bacteria", "Bacteria", …
$ phylum                <chr> "Acidobacteriota", "Actinomycetota", "Actinomyce…
$ class                 <chr> "Terriglobia", "Thermoleophilia", "Thermoleophil…
$ order                 <chr> "Terriglobales", "Solirubrobacterales", "Solirub…
$ family                <chr> "SbA1", "Solirubrobacteraceae", "Solirubrobacter…
$ genus                 <chr> "Sulfotelmatobacter", "Palsa-744", "Palsa-744", …
$ bin_methods           <chr> "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0…
$ created_by            <chr> "IMG_PIPELINE", "IMG_PIPELINE", "IMG_PIPELINE", …
$ date_added            <date> 2025-02-03, 2025-02-03, 2025-02-03, 2025-02-03,…
$ bin_completeness      <dbl> 95.61, 94.66, 51.78, 60.83, 81.54, 88.11, 77.19,…
$ bin_contamination     <dbl> 4.96, 1.99, 9.42, 0.04, 0.18, 0.19, 4.25, 0.66, …
$ average_coverage      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ total_number_of_bases <dbl> 6486507, 3237739, 2712910, 2720369, 3539081, 395…
$ x5s_r_rna             <dbl> 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, …
$ x16s_r_rna            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
$ x23s_r_rna            <dbl> 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, …
$ t_rna_genes           <dbl> 54, 51, 30, 10, 30, 29, 36, 21, 24, 37, 20, 21, …
$ gene_count            <dbl> 5785, 3054, 3039, 2539, 3314, 3753, 5943, 2297, …
$ scaffold_count        <dbl> 100, 52, 513, 152, 245, 91, 497, 188, 470, 227, …
$ gold_study_id         <chr> "Gs0166454", "Gs0166454", "Gs0166454", "Gs016645…
$ community             <chr> "Soil microbial communities", "Soil microbial co…
$ type                  <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "Soil", …

I started by asking co-pilot for help with making a stacked bar graph looking at the relative abundance of each phylum of bacteria.

In this code, we take taxonomic data count the abundance of each phylum at the different sites, calculate the relative abundance of phylum’s at each site, and visualize it through a stacked bar graph.

#MAKING A RELATIVE ABUNDANCE GRAPH
library(dplyr)
library(tidyr)
library(ggplot2)

# --- Parse taxonomy ---
taxonomy_split <- df %>%
  separate(gtdb_taxonomy_lineage,
           into = c("Domain","Phylum","Class","Order","Family","Genus"),
           sep = ";", fill = "right", remove = FALSE)

# --- Summarize composition at Phylum level per Site ---
phylum_counts <- taxonomy_split %>%
  group_by(site, Phylum) %>%
  summarise(total_abundance = n(), .groups = "drop") %>%
  group_by(site) %>%
  mutate(RelativeAbundance = total_abundance / sum(total_abundance)) %>%
  ungroup()

# --- Plot stacked bar chart ---
ggplot(phylum_counts, aes(x = site, y = RelativeAbundance, fill = Phylum)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Relative Abundance of Phyla Across Sites",
       x = "Site",
       y = "Relative Abundance") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Making a Heat Map of Relative Abundance

In effort to visualize our newly transformed data set in a new way, I asked Co-Pilot to create an R-script for a heat map showing the relative abundance of each phylum. Here we are taking the same transformed data set from before, but just changing the visualization.

library(ggplot2)

ggplot(phylum_counts, aes(x = site, y = Phylum, fill = RelativeAbundance)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal() +
  labs(title = "Heatmap of Phylum Relative Abundance",
       x = "Site",
       y = "Phylum",
       fill = "Rel. Abundance")

New Ideas:

After creating these two figures, I wanted to explore the different analyses that are possible with a large data set like this.

I re-uploaded the csv file to copilot and asked: What types of analyses in R should I run with this data set?

In summary, Co-pilot responded with :

Microbial Community Analyses

  • Taxonomic composition

    • Summarize counts or relative abundances at different taxonomic levels (phylum, genus).

    • Visualize with stacked bar plots, pie charts, or heatmaps.

  • Diversity metrics

    • Calculate alpha diversity (Shannon, Simpson) per site.

    • Compare beta diversity (Bray–Curtis, Jaccard) between sites.

    • Packages: vegan, phyloseq.

Environmental & Geographic Associations

  • Ordination methods

    • PCA, NMDS, or PCoA to see clustering of sites by taxonomy or soil chemistry.

    • Canonical correspondence analysis (CCA) or redundancy analysis (RDA) to link microbial composition with soil variables.

  • Regression models

    • Linear or generalized linear models to test how soil chemistry (pH, nutrients, C:N ratio) predicts microbial diversity or abundance.
  • Spatial visualization

    • Map MAG distributions across sites using ggplot2 + sf.

Functional & Ecological Insights

  • Indicator taxa analysis

    • Identify taxa strongly associated with particular soil conditions.
  • Co-occurrence networks

    • Build correlation networks of MAGs to explore microbial interactions.

    • Package: igraph.

  • Trait-based analysis

    • If functional annotations are available, compare metabolic potential across environments.

Data Exploration & Visualization

  • Heatmaps

    • Show abundance of taxa across sites with soil chemistry gradients.
  • Boxplots/violin plots

    • Compare diversity or abundance across categorical variables (soil type, biome).
  • Correlation matrices

    • Explore relationships among soil chemistry variables and microbial metrics.

All of this was really useful information and allowed to build a framework for the different analyses I wanted to complete using this MAG data set.

I next had Co-pilot conduct a step by step work flow to do the previously mentioned analyses starting from the beginning with loading in the csv file.

Setup and Data Loading

Here I just want to load all of the packages into R and check that the column names and types are all correct.

# Core packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(forcats)

# Ecology and ordination
library(vegan)

# Read data
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)


# Quick checks
names(df)
 [1] "X"                     "genomicsSampleID"      "soilMoisture"         
 [4] "decimalLatitude"       "decimalLongitude"      "elevation"            
 [7] "soilTemp"              "soilInWaterpH"         "soilInCaClpH"         
[10] "bin_oid"               "bin_id"                "site"                 
[13] "site_ID"               "subplot"               "layer"                
[16] "quadrant"              "date"                  "img_genome_id"        
[19] "bin_quality"           "bin_lineage"           "gtdb_taxonomy_lineage"
[22] "domain"                "phylum"                "class"                
[25] "order"                 "family"                "genus"                
[28] "bin_methods"           "created_by"            "date_added"           
[31] "bin_completeness"      "bin_contamination"     "average_coverage"     
[34] "total_number_of_bases" "x5s_r_rna"             "x16s_r_rna"           
[37] "x23s_r_rna"            "t_rna_genes"           "gene_count"           
[40] "scaffold_count"        "gold_study_id"         "community"            
[43] "type"                 
str(df)
'data.frame':   5573 obs. of  43 variables:
 $ X                    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ genomicsSampleID     : chr  "BONA_001-O-20230710" "BONA_001-O-20230710" "BONA_001-O-20230710" "BONA_001-O-20230710" ...
 $ soilMoisture         : num  1.55 1.55 1.55 1.55 1.55 ...
 $ decimalLatitude      : num  65.2 65.2 65.2 65.2 65.2 ...
 $ decimalLongitude     : num  -147 -147 -147 -147 -147 ...
 $ elevation            : num  374 374 374 374 374 ...
 $ soilTemp             : num  8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 8.1 ...
 $ soilInWaterpH        : num  3.99 3.99 3.99 3.99 3.99 ...
 $ soilInCaClpH         : num  3.45 3.45 3.45 3.45 3.45 ...
 $ bin_oid              : chr  "3300079481_s0" "3300079481_s1" "3300079481_s101" "3300079481_s11" ...
 $ bin_id               : chr  "3300079481_s0" "3300079481_s1" "3300079481_s101" "3300079481_s11" ...
 $ site                 : chr  "Caribou-Poker Creeks Research Watershed NEON Field Site, Alaska, USA" "Caribou-Poker Creeks Research Watershed NEON Field Site, Alaska, USA" "Caribou-Poker Creeks Research Watershed NEON Field Site, Alaska, USA" "Caribou-Poker Creeks Research Watershed NEON Field Site, Alaska, USA" ...
 $ site_ID              : chr  "BONA" "BONA" "BONA" "BONA" ...
 $ subplot              : int  1 1 1 1 1 1 1 1 1 1 ...
 $ layer                : chr  "O" "O" "O" "O" ...
 $ quadrant             : logi  NA NA NA NA NA NA ...
 $ date                 : int  20230710 20230710 20230710 20230710 20230710 20230710 20230710 20230710 20230710 20230710 ...
 $ img_genome_id        : num  3.3e+09 3.3e+09 3.3e+09 3.3e+09 3.3e+09 ...
 $ bin_quality          : chr  "MQ" "HQ" "MQ" "MQ" ...
 $ bin_lineage          : chr  "Bacteria; Acidobacteriota; Terriglobia" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales" "Bacteria; Acidobacteriota; Terriglobia; Terriglobales" ...
 $ gtdb_taxonomy_lineage: chr  "Bacteria; Acidobacteriota; Terriglobia; Terriglobales; SbA1; Sulfotelmatobacter" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales; Solirubrobacteraceae; Palsa-744" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales; Solirubrobacteraceae; Palsa-744" "Bacteria; Acidobacteriota; Terriglobia; Terriglobales; SbA1; Sulfotelmatobacter" ...
 $ domain               : chr  "Bacteria" "Bacteria" "Bacteria" "Bacteria" ...
 $ phylum               : chr  "Acidobacteriota" "Actinomycetota" "Actinomycetota" "Acidobacteriota" ...
 $ class                : chr  "Terriglobia" "Thermoleophilia" "Thermoleophilia" "Terriglobia" ...
 $ order                : chr  "Terriglobales" "Solirubrobacterales" "Solirubrobacterales" "Terriglobales" ...
 $ family               : chr  "SbA1" "Solirubrobacteraceae" "Solirubrobacteraceae" "SbA1" ...
 $ genus                : chr  "Sulfotelmatobacter" "Palsa-744" "Palsa-744" "Sulfotelmatobacter" ...
 $ bin_methods          : chr  "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" ...
 $ created_by           : chr  "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" ...
 $ date_added           : chr  "2025-02-03" "2025-02-03" "2025-02-03" "2025-02-03" ...
 $ bin_completeness     : num  95.6 94.7 51.8 60.8 81.5 ...
 $ bin_contamination    : num  4.96 1.99 9.42 0.04 0.18 0.19 4.25 0.66 0.16 3.17 ...
 $ average_coverage     : logi  NA NA NA NA NA NA ...
 $ total_number_of_bases: int  6486507 3237739 2712910 2720369 3539081 3957793 6054409 2329620 4300045 4009333 ...
 $ x5s_r_rna            : int  1 1 1 0 0 1 0 0 0 0 ...
 $ x16s_r_rna           : int  1 1 1 0 0 0 0 0 0 0 ...
 $ x23s_r_rna           : int  0 2 1 0 0 0 0 0 0 0 ...
 $ t_rna_genes          : int  54 51 30 10 30 29 36 21 24 37 ...
 $ gene_count           : int  5785 3054 3039 2539 3314 3753 5943 2297 4305 3695 ...
 $ scaffold_count       : int  100 52 513 152 245 91 497 188 470 227 ...
 $ gold_study_id        : chr  "Gs0166454" "Gs0166454" "Gs0166454" "Gs0166454" ...
 $ community            : chr  "Soil microbial communities" "Soil microbial communities" "Soil microbial communities" "Soil microbial communities" ...
 $ type                 : chr  "Soil" "Soil" "Soil" "Soil" ...

Cleaning and Taxonomy Parsing

Here I am separating the taxonomy into levels and cleaning up the data.

# Trim whitespace in taxonomy strings (helps avoid duplicate labels)
df <- df %>%
  mutate(gtdb_taxonomy_lineage = trimws(gtdb_taxonomy_lineage))

# Parse taxonomy into levels
taxonomy_split <- df %>%
  separate(
    gtdb_taxonomy_lineage,
    into = c("Domain","Phylum","Class","Order","Family","Genus"),
    sep = ";",
    fill = "right",
    remove = FALSE
  ) %>%
  mutate(
    Domain = trimws(Domain),
    Phylum = trimws(Phylum),
    Class  = trimws(Class),
    Order  = trimws(Order),
    Family = trimws(Family),
    Genus  = trimws(Genus)
  )

# Optional: clean prefixes like "d__", "p__", etc.
strip_prefix <- function(x) sub("^[a-z]__", "", x)
taxonomy_split <- taxonomy_split %>%
  mutate(
    Domain = strip_prefix(Domain),
    Phylum = strip_prefix(Phylum),
    Class  = strip_prefix(Class),
    Order  = strip_prefix(Order),
    Family = strip_prefix(Family),
    Genus  = strip_prefix(Genus)
  )

Build site-by-taxon matrix

Similar to the previous labs workflow, Co-pilot had me start by calculating the relative abundance in a site-by-taxon matrix. Here I am calculating the relative abundance aka the number of MAGs per phylum per site.

Counts if no abundance column present

# Site × Phylum counts (number of MAGs per phylum per site)

phylum_abundance <- taxonomy_split %>%
  group_by(site, Phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = Phylum, values_from = count, values_fill = 0)

# Ensure Site is kept as an ID
phylum_abundance <- phylum_abundance %>% arrange(site)

Relative abundance per site

# Convert to long and compute within-site relative abundance
phylum_long <- phylum_abundance %>%
  pivot_longer(-site, names_to = "Phylum", values_to = "Count") %>%
  group_by(site) %>%
  mutate(RelAbund = Count / sum(Count)) %>%
  ungroup()

Stacked bars of relative abundance

Here I am building another stacked bar graph using the calculated relative abundance with the cleaned up data set. For the second part of this process, I am visualizing this data with ggplot.

# Order phyla by overall mean abundance for consistent coloring
phylum_order <- phylum_long %>%
  group_by(Phylum) %>%
  summarise(mean_rel = mean(RelAbund)) %>%
  arrange(desc(mean_rel)) %>%
  pull(Phylum)

phylum_long$Phylum <- factor(phylum_long$Phylum, levels = phylum_order)

ggplot(phylum_long, aes(x = site, y = RelAbund, fill = Phylum)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Relative abundance of phyla across sites", y = "Relative abundance", x = "Site") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Diversity metrics (alpha and beta)

In effort to better understand the diversity at each site, we can calculate the alpha diversity (Shannon/Simpson) as well as the beta diversity (Bray-Curtis). From Co-pilot, I learned that the alpha diversity measures the diversity or richness within a single sample. The Shannon index is important for understand how abundance is distributes across all the species at a site. The Simpson index is useful for highlight the dominance of a few species at a site.

# Alpha diversity (Shannon, Simpson) using counts
comm_matrix <- phylum_abundance %>% select(-site) %>% as.matrix()

alpha_df <- data.frame(
  site = phylum_abundance$site,
  Shannon = diversity(comm_matrix, index = "shannon"),
  Simpson = diversity(comm_matrix, index = "simpson")
)

# Beta diversity (Bray-Curtis)
bray <- vegdist(comm_matrix, method = "bray")

Co-Pilot’s Notes on Interpretation: Higher Shannon/Simpson indicate more diverse phylum composition per site.

Alpha diversity box plots by a categorical variable

This next code chunk takes the previous information we collected about the alpha diversity aka the Shannon/Simpson values and visualizes the data in a box plot so we can see the distribution.

# Example: biome or soil type (adjust column name)
alpha_plot_df <- alpha_df %>%
  left_join(df %>% select(site, phylum) %>% distinct(), by = "site")

ggplot(alpha_plot_df, aes(x = phylum, y = Shannon)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Shannon diversity by soil type", x = "Soil type", y = "Shannon")

ggplot(alpha_plot_df, aes(x = phylum, y = Simpson)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Simpson diversity by soil type", x = "Soil type", y = "Simpson")

NMDS (robust for community data) Visualizing the Beta Diversity.

To understand the Beta diversity information, we can transform this data and visualize it different ways. The first code chunk was my first iteration of the NMDS plot which is basically just a plot that reduces complex data ina 2D map.

set.seed(42)
nmds <- metaMDS(comm_matrix, distance = "bray", k = 2, trymax = 100)
Square root transformation
Wisconsin double standardization
Run 0 stress 9.268831e-05 
Run 1 stress 5.16498e-05 
... New best solution
... Procrustes: rmse 9.272703e-05  max resid 0.000176988 
... Similar to previous best
Run 2 stress 9.35144e-05 
... Procrustes: rmse 7.542616e-05  max resid 0.000161206 
... Similar to previous best
Run 3 stress 9.734344e-05 
... Procrustes: rmse 4.05373e-05  max resid 7.682049e-05 
... Similar to previous best
Run 4 stress 9.807359e-05 
... Procrustes: rmse 7.878789e-05  max resid 0.0001743309 
... Similar to previous best
Run 5 stress 8.347793e-05 
... Procrustes: rmse 3.662838e-05  max resid 6.137061e-05 
... Similar to previous best
Run 6 stress 9.699531e-05 
... Procrustes: rmse 9.431715e-05  max resid 0.0001809109 
... Similar to previous best
Run 7 stress 9.795331e-05 
... Procrustes: rmse 7.694697e-05  max resid 0.0001548931 
... Similar to previous best
Run 8 stress 9.523465e-05 
... Procrustes: rmse 6.634562e-05  max resid 0.000137503 
... Similar to previous best
Run 9 stress 6.725938e-05 
... Procrustes: rmse 3.725153e-05  max resid 7.864607e-05 
... Similar to previous best
Run 10 stress 7.635622e-05 
... Procrustes: rmse 3.298862e-05  max resid 9.228987e-05 
... Similar to previous best
Run 11 stress 9.070719e-05 
... Procrustes: rmse 8.712375e-05  max resid 0.0001687024 
... Similar to previous best
Run 12 stress 8.517662e-05 
... Procrustes: rmse 4.600564e-05  max resid 7.469896e-05 
... Similar to previous best
Run 13 stress 9.185952e-05 
... Procrustes: rmse 3.851356e-05  max resid 9.393277e-05 
... Similar to previous best
Run 14 stress 8.868226e-05 
... Procrustes: rmse 4.110068e-05  max resid 8.494076e-05 
... Similar to previous best
Run 15 stress 9.0247e-05 
... Procrustes: rmse 7.561392e-05  max resid 0.0001621105 
... Similar to previous best
Run 16 stress 8.19539e-05 
... Procrustes: rmse 4.222065e-05  max resid 7.218457e-05 
... Similar to previous best
Run 17 stress 5.77156e-05 
... Procrustes: rmse 3.584369e-05  max resid 6.278111e-05 
... Similar to previous best
Run 18 stress 9.30164e-05 
... Procrustes: rmse 4.718912e-05  max resid 8.801345e-05 
... Similar to previous best
Run 19 stress 9.624154e-05 
... Procrustes: rmse 4.460785e-05  max resid 7.929526e-05 
... Similar to previous best
Run 20 stress 9.371389e-05 
... Procrustes: rmse 9.333008e-05  max resid 0.0001879802 
... Similar to previous best
*** Best solution repeated 20 times
nmds_scores <- as.data.frame(nmds$points)
nmds_scores$site <- phylum_abundance$site

ggplot(nmds_scores, aes(x = MDS1, y = MDS2)) +
  geom_point(size = 3, alpha = 0.8, color = "tomato") +
  theme_minimal() +
  labs(title = "NMDS (Bray-Curtis) of sites by phylum composition")

In this second code chunk, I asked co-pilot to help me visualize this data in a different way using a heat map to show the difference between each of the sites.

Here are some notes from Co-Pilot:

What this does

  • NMDS plot: Shows sites positioned in 2D space based on Bray–Curtis dissimilarities. Sites closer together have more similar phylum compositions.

  • Heatmap: Shows pairwise dissimilarities between all sites. Darker colors = more dissimilar.

#Load Libraries
library(vegan)
library(ggplot2)
library(reshape2)

# --- Alpha diversity (already computed) ---
comm_matrix <- phylum_abundance %>% select(-site) %>% as.matrix()


alpha_df <- data.frame(
  site = phylum_abundance$site,
  Shannon = diversity(comm_matrix, index = "shannon"),
  Simpson = diversity(comm_matrix, index = "simpson")
)

# --- Beta diversity (Bray-Curtis) ---
bray <- vegdist(comm_matrix, method = "bray")

# --- Option 2: Heatmap of Bray-Curtis distances ---
bray_matrix <- as.matrix(bray)
rownames(bray_matrix) <- phylum_abundance$site
colnames(bray_matrix) <- phylum_abundance$site

# Melt for ggplot
bray_melt <- melt(bray_matrix)

ggplot(bray_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_viridis_c(option = "plasma") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Bray-Curtis Dissimilarity Heatmap",
       x = "Site", y = "Site", fill = "Dissimilarity")

Ordination (PCA)

To see if there is any clustering of sites by taxonomy or other variables, we can use a PCA plot to do so.

In order to make a PCA we need to prepare the data. I asked Co-Pilot to help me prep the MAG data for the PCA plot.

Ensure numeric matrix: Convert all taxa columns to numeric, replace NA with 0 before ordination.

comm_matrix <- phylum_abundance %>% select(-site) %>% mutate(across(everything(), as.numeric)) %>% as.matrix()
comm_matrix[is.na(comm_matrix)] <- 0

I first start this PC analysis by doing a Hellinger transofrmation on this data set. This transformation refers to data that has been adjusted by taking the square root of each species relative abundance within a sample to make it suitable for linear ordination aka PCA.

PCA on Hellinger-transformed data

# Hellinger transformation (recommended for community composition)
comm_hell <- decostand(comm_matrix, method = "hellinger")

# PCA
pca_res <- rda(comm_hell)  # PCA via RDA on Hellinger data
pca_scores <- scores(pca_res, display = "sites") %>% as.data.frame()
pca_scores$site <- phylum_abundance$site

# Plot PCA
var_exp <- summary(pca_res)$cont$importance[2, 1:2] * 100

ggplot(pca_scores, aes(x = PC1, y = PC2, label = site)) +
  geom_point(size = 3, alpha = 0.8, color = "steelblue") +
  geom_text(vjust = -0.6, size = 2) +
  theme_minimal() +
  labs(
    title = "PCA of sites by phylum composition (Hellinger)",
    x = paste0("PC1 (", round(var_exp[1], 1), "%)"),
    y = paste0("PC2 (", round(var_exp[2], 1), "%)")
  )

Notes from Copilot: PCA is good for linear structure; NMDS handles non-linear relationships and sparse data well.

Environmental associations (PERMANOVA, envfit, RDA/CCA)

Next I wanted to see if we could use this data set to make certain associations between different soil variables. In order to do this there a different analyses in R that Co-Pilot showed me.

First the ai had me prepare the metadata for these different analyses.

Prepare environmental metadata

# Select soil chemistry columns (adjust names)
env <- df %>%
  select(site, soilInWaterpH, soilMoisture, soilTemp, soilInCaClpH) %>%
  distinct(site, .keep_all = TRUE)

# Join to match community rows
env <- phylum_abundance %>%
  select(site) %>%
  left_join(env, by = "site")

# Handle missing values conservatively
env_clean <- env %>%
  mutate(across(where(is.numeric), ~ ifelse(is.infinite(.), NA, .))) %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

A PERMANOVA is a Permutational Multivariate Analysis of Variance which is a statistical test used to compare groups in a complex multivariate data set and is very commonly used for ecological data.

PERMANOVA: Do soil variables explain community differences?

# Make sure row order of env_clean matches comm_matrix
stopifnot(all(env_clean$Site == phylum_abundance$site))

# Example model (adjust variables)
perm <- adonis2(bray ~ soilInWaterpH + soilMoisture + soilTemp + soilInCaClpH , data = env_clean)
perm
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999

adonis2(formula = bray ~ soilInWaterpH + soilMoisture + soilTemp + soilInCaClpH, data = env_clean)
              Df SumOfSqs      R2      F Pr(>F)    
soilInWaterpH  1   0.9818 0.19724 7.7045  0.001 ***
soilMoisture   1   0.0503 0.01011 0.3951  0.850    
soilTemp       1   0.1300 0.02613 1.0205  0.409    
soilInCaClpH   1   0.3748 0.07530 2.9414  0.019 *  
Residual      27   3.4408 0.69122                  
Total         31   4.9779 1.00000                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From this PERMANOVA we can see that the only variable that significantly explains difference in the community composition is the soilInCaClpH with a p value of 0.019.

Envfit: overlay environmental vectors on ordination

Next we used Environmental Fitting which is a part of the vegan package to fit this different variables like pH, temperature and soil type as vectors on an ordination plot aka PCA/NMDS.

# Fit onto PCA space
fit <- envfit(pca_res, env_clean %>% select(-site), permutations = 999)

# Extract significant vectors
sig_vecs <- as.data.frame(fit$vectors$arrows) %>%
  mutate(var = rownames(fit$vectors$arrows),
         pval = fit$vectors$pvals) %>%
  filter(pval <= 0.05)
sig_vecs
                     PC1       PC2           var  pval
soilInWaterpH -0.8729143 0.4878736 soilInWaterpH 0.001
soilInCaClpH  -0.8280194 0.5606994  soilInCaClpH 0.001

As we can see here, through this analysis, the soilInWaterpH and the soilInCaClpH had a pval of 0.001, meaning that these variables are potentially contributing to differences seen among the sites.

Notes From Copilot On How To Tidy Data…

Code top Reduce sparsity: Consider filtering phyla with very low totals to reduce noise.

keep <- colSums(comm_matrix) > 2
comm_matrix <- comm_matrix[, keep]

Save Out Put!

write.csv(alpha_df, "alpha_diversity.csv", row.names = FALSE)
ggsave("stacked_relative_abundance.png", width = 10, height = 6, dpi = 300)
ggsave("pca_phylum_hellinger.png", width = 8, height = 6, dpi = 300)
ggsave("nmds_bray.png", width = 8, height = 6, dpi = 300)

After completing these different analyses, I wanted to make a graph that was informative but more interactive. I remembered one the previous assignments from the R workbook where we made an interactive graph using a map of the United States. Using Co-Pilot, I was able to apply the data from this MAG data set to an interactive map.

Interactive Map Perfect In R that shows relative abundance values at each site.

First I wanted to see the relative abundance by phylum at each of the sites, a simple proportion of MAGS per phylum.

# Install if needed
# install.packages("leaflet")
# install.packages("dplyr")

library(leaflet)
library(dplyr)

# --- Step 1: Read in your dataset ---
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)

# --- Step 2: Summarize relative abundance per site × phylum ---
# If you don’t have an explicit abundance column, we’ll use counts of MAGs per phylum
site_summary <- df %>%
  group_by(site, decimalLatitude, decimalLongitude, phylum) %>%
  summarise(RelAbund = n() / nrow(df), .groups = "drop")

# --- Step 3: Define color palette for phyla ---
pal <- colorFactor(palette = "Set1", domain = site_summary$phylum)

# --- Step 4: Build interactive map ---
leaflet(site_summary) %>%
  addProviderTiles("CartoDB.Positron") %>%   # nice basemap
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    radius = ~RelAbund * 1000,   # scale circle size by relative abundance
    color = ~pal(phylum),
    stroke = FALSE,
    fillOpacity = 0.7,
    popup = ~paste0(
      "<b>Site:</b> ", site, "<br>",
      "<b>Phylum:</b> ", phylum, "<br>",
      "<b>Relative Abundance:</b> ", round(RelAbund, 3)
    )
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~phylum,
    title = "Phylum",
    opacity = 1
  )

Interactive Map Showing the relative abundance within each site

In this interactive graph, we can see the relative abundance within each site.

# Install if needed
# install.packages("leaflet")
# install.packages("dplyr")

library(leaflet)
library(dplyr)

# --- Step 1: Read in your dataset ---
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)

# --- Step 2: Summarize relative abundance per site × phylum ---
# If you don’t have an explicit abundance column, we’ll use counts of MAGs per phylum
site_summary <- df %>%
  group_by(site, decimalLatitude, decimalLongitude, phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(site) %>%
  mutate(RelAbund = count / sum(count)) %>%
  ungroup()

# --- Step 3: Define color palette for phyla ---
pal <- colorFactor(palette = "Set1", domain = site_summary$phylum)

# --- Step 4: Build interactive map ---
leaflet(site_summary) %>%
  addProviderTiles("CartoDB.Positron") %>%   # nice basemap
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    radius = ~RelAbund * 50,   # scale circle size by relative abundance
    color = ~pal(phylum),
    stroke = FALSE,
    fillOpacity = 0.7,
    popup = ~paste0(
      "<b>Site:</b> ", site, "<br>",
      "<b>Phylum:</b> ", phylum, "<br>",
      "<b>Relative Abundance:</b> ", round(RelAbund, 3)
    )
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~phylum,
    title = "Phylum",
    opacity = 1
  )

Interactive Map Showing the Dominant Phylum per site

(Circle Size Reflects Raw counts)

library(leaflet)
library(dplyr)

# --- Step 1: Read in your dataset ---
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)

# --- Step 2: Compute dominant phylum per site ---
dominant_phylum <- df %>%
  group_by(site, decimalLatitude, decimalLongitude, phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(site) %>%
  slice_max(order_by = count, n = 1, with_ties = FALSE) %>%   # keep only one phylum per site
  ungroup()

# --- Step 3: Define color palette for phyla ---
pal <- colorFactor(palette = "Set1", domain = dominant_phylum$phylum)

# --- Step 4: Build interactive map ---
leaflet(dominant_phylum) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    radius = ~count * 0.5,   # scale circle size by number of MAGs in dominant phylum
    color = ~pal(phylum),
    stroke = FALSE,
    fillOpacity = 0.7,
    popup = ~paste0(
      "<b>Site:</b> ", site, "<br>",
      "<b>Dominant Phylum:</b> ", phylum, "<br>",
      "<b>Count:</b> ", count
    )
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~phylum,
    title = "Dominant Phylum",
    opacity = 1
  )

Interactive Map Showing the Dominant Phylum per site

In this interactive graph, co-pilot suggest that we aggregate the data by dominant phylum per site so at each site one single cirle represents the dominat phylum. These circles also reflect raw counts.

library(leaflet)
library(dplyr)

# --- Step 1: Read in your dataset ---
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)

# --- Step 2: Compute relative abundance per site × phylum ---
site_summary <- df %>%
  group_by(site, decimalLatitude, decimalLongitude, phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(site) %>%
  mutate(RelAbund = count / sum(count)) %>%
  ungroup()

# --- Step 3: Extract dominant phylum per site (highest relative abundance) ---
dominant_phylum <- site_summary %>%
  group_by(site) %>%
  slice_max(order_by = RelAbund, n = 1, with_ties = FALSE) %>%
  ungroup()

# --- Step 4: Define color palette for phyla ---
pal <- colorFactor(palette = "Set1", domain = dominant_phylum$phylum)

# --- Step 5: Build interactive map ---
leaflet(dominant_phylum) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    radius = ~RelAbund * 50,   # circle size scaled by relative abundance
    color = ~pal(phylum),
    stroke = FALSE,
    fillOpacity = 0.7,
    popup = ~paste0(
      "<b>Site:</b> ", site, "<br>",
      "<b>Dominant Phylum:</b> ", phylum, "<br>",
      "<b>Relative Abundance:</b> ", round(RelAbund, 3)
    )
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~phylum,
    title = "Dominant Phylum",
    opacity = 1
  )

In this next iteration of the interactive graph, I wanted to add back the relative abundance information rather than the raw counts and also make it so you could chose which phylum you are looking at. I prompted co-pilot with my requests and was able to generate the following graph!

library(leaflet)
library(dplyr)
library(leaflet.extras)

# --- Step 1: Read in your dataset ---
df <- read.csv("NEON_soilMAGs_soilChem.csv", header = TRUE, stringsAsFactors = FALSE)

# --- Step 2: Compute relative abundance per site × phylum ---
site_summary <- df %>%
  group_by(site, decimalLatitude, decimalLongitude, phylum) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(site) %>%
  mutate(RelAbund = count / sum(count)) %>%
  ungroup()

# --- Step 3: Extract dominant phylum per site ---
dominant_phylum <- site_summary %>%
  group_by(site) %>%
  slice_max(order_by = RelAbund, n = 1, with_ties = FALSE) %>%
  ungroup()

# --- Step 4: Define color palette ---
pal <- colorFactor(palette = "Set1", domain = dominant_phylum$phylum)

# --- Step 5: Build interactive map with legend + filter ---
leaflet(dominant_phylum) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~decimalLongitude,
    lat = ~decimalLatitude,
    radius = ~RelAbund * 50,
    color = ~pal(phylum),
    stroke = FALSE,
    fillOpacity = 0.7,
    popup = ~paste0(
      "<b>Site:</b> ", site, "<br>",
      "<b>Dominant Phylum:</b> ", phylum, "<br>",
      "<b>Relative Abundance:</b> ", round(RelAbund, 3)
    ),
    group = ~phylum   # group markers by phylum for filtering
  ) %>%
  addLegend(
    "bottomright",
    pal = pal,
    values = ~phylum,
    title = "Dominant Phylum",
    opacity = 1
  ) %>%
  addLayersControl(
    overlayGroups = unique(dominant_phylum$phylum),
    options = layersControlOptions(collapsed = FALSE)
  )

In this graph…

group = ~phylum: assigns each marker to a group based on its phylum.

  • addLayersControl(): adds a control box so you can toggle phyla on/off interactively.

  • Legend: still shows the color mapping for phyla.

Happy Coding and Good Luck!